#Airbnb ML challenge
import numpy as np
import pandas as pd
import os
# The three CSV files that make up the dataset.
required_files = ('calendar.csv', 'listings.csv', 'reviews.csv')

# Walk the data directory, printing every file found, and remember the
# directory that actually contains all three CSVs. Relying on the loop
# variable `dirname` after the loop would silently use whichever directory
# os.walk happened to visit last (or fail with NameError if 'data/' is missing).
data_dir = None
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
    if all(name in filenames for name in required_files):
        data_dir = dirname
if data_dir is None:
    raise FileNotFoundError(
        "Could not find calendar.csv, listings.csv and reviews.csv together under 'data/'"
    )

calendar_data_path = os.path.join(data_dir, 'calendar.csv')
listings_data_path = os.path.join(data_dir, 'listings.csv')
reviews_data_path = os.path.join(data_dir, 'reviews.csv')

# Load each CSV into its own DataFrame.
calendar = pd.read_csv(calendar_data_path)
listings = pd.read_csv(listings_data_path)
reviews = pd.read_csv(reviews_data_path)
listings.head(2)

# Keep only the price target and the listing attributes used as predictors.
listings_df = listings[['price', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'room_type', 'property_type']]
# Wider alternative feature set, kept for reference:
#listings_df=listings[['id','host_id','reviews_per_month','price','accommodates','bathrooms','bedrooms','beds','bed_type','room_type','property_type']]
listings_df.head()

# Drop rows with any missing value. The explicit copy makes the cleaned frame
# independent of `listings`, so the column assignment below is not a chained
# assignment on a derived frame (avoids SettingWithCopyWarning).
listings_df_cleaned = listings_df.dropna().copy()
#Percent of rows dropped after cleaning
((len(listings_df) - len(listings_df_cleaned)) / len(listings_df)) * 100
listings_df_cleaned.head()

# Strip '$', ',' and spaces from the price strings and convert to float.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the prices untouched and make
# the float conversion fail.
listings_df_cleaned['price'] = (
    listings_df_cleaned['price'].str.replace(r"[$, ]", "", regex=True).astype("float")
)
listings_df_cleaned.head()

# Inspect the distinct values of each categorical column.
listings_df_cleaned.bed_type.unique()
listings_df_cleaned.room_type.unique()
listings_df_cleaned.property_type.unique()
import plotly.express as px
# Plot price against each candidate feature, one scatter figure per feature.
# Each entry pairs the x-axis column with the exact title shown on the chart.
price_scatter_specs = [
    ("accommodates", "Accommodates vs Price"),
    ("bedrooms", "Bedrooms vs Price"),
    ("bathrooms", "Bathrooms vs Price"),
    ("beds", "beds vs Price"),
    ("bed_type", "Bed Type vs Price"),
    ("room_type", "Room Type vs Price"),
    ("property_type", "Property Type vs Price"),
]
for feature_column, chart_title in price_scatter_specs:
    fig = px.scatter(listings_df_cleaned, x=feature_column, y="price", title=chart_title)
    fig.show()
#One hot encoding of some of the categorical non-numerical data
# `axis` must be passed by keyword: pandas 2.x no longer accepts it
# positionally in pd.concat.
# NOTE: the 'room-type' prefix uses a hyphen, unlike the other two prefixes;
# it is kept as-is so the resulting column names do not change.
listings_df_cleaned = pd.concat(
    (listings_df_cleaned, pd.get_dummies(listings_df_cleaned.room_type, prefix='room-type')),
    axis=1,
)
listings_df_cleaned = pd.concat(
    (listings_df_cleaned, pd.get_dummies(listings_df_cleaned.bed_type, prefix='bed_type')),
    axis=1,
)
listings_df_cleaned = pd.concat(
    (listings_df_cleaned, pd.get_dummies(listings_df_cleaned.property_type, prefix='property_type')),
    axis=1,
)
#Remove One Hot Encoded columns
# The original categorical columns are now redundant with their dummy columns.
listings_df_cleaned = listings_df_cleaned.drop(columns=['bed_type', 'room_type', 'property_type'])
listings_df_cleaned.head()
from sklearn import linear_model
# Split the cleaned frame into predictors (every column except 'price')
# and the regression target (the 'price' column).
is_feature_column = listings_df_cleaned.columns != 'price'
features = listings_df_cleaned.loc[:, is_feature_column]
target = listings_df_cleaned.price

# Fit an ordinary least-squares model of price on all predictors.
reg = linear_model.LinearRegression()
reg.fit(features, target)
# Alternative fit using only the numeric columns, kept for reference:
#reg.fit(listings_df_cleaned[['accommodates','bathrooms','bedrooms','beds']],listings_df_cleaned.price)